In [23]:
import pandas as pd

file_path = r'D:\Bootcamp\Main\3_spotify_5000_songs.csv'
spotify5k_df = pd.read_csv(file_path)
spotify5k_df = spotify5k_df.rename(columns=lambda x: x.strip())
spotify5k_df.info()
spotify5k_df     
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5235 entries, 0 to 5234
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        5235 non-null   int64  
 1   name              5235 non-null   object 
 2   artist            5235 non-null   object 
 3   danceability      5235 non-null   float64
 4   energy            5235 non-null   float64
 5   key               5235 non-null   int64  
 6   loudness          5235 non-null   float64
 7   mode              5235 non-null   int64  
 8   speechiness       5235 non-null   float64
 9   acousticness      5235 non-null   float64
 10  instrumentalness  5235 non-null   float64
 11  liveness          5235 non-null   float64
 12  valence           5235 non-null   float64
 13  tempo             5235 non-null   float64
 14  type              5235 non-null   object 
 15  duration_ms       5235 non-null   int64  
 16  time_signature    5235 non-null   int64  
 17  id                5235 non-null   object 
 18  html              5235 non-null   object 
dtypes: float64(9), int64(5), object(5)
memory usage: 777.2+ KB
Out[23]:
Unnamed: 0 name artist danceability energy key loudness mode speechiness acousticness instrumentalness liveness valence tempo type duration_ms time_signature id html
0 0 Se Eu Quiser Falar Com Deus ... Gilberto Gil 0.6580 0.25900 11 -13.141 0 0.0705 0.694 0.000059 0.9750 0.3060 110.376 256213 4 1n7JnwviZ7zf0LR1tcGFq7 https://open.spotify.com/track/1n7JnwviZ7zf0LR...
1 1 Saudade De Bahia ... Antônio Carlos Jobim 0.7420 0.39900 2 -12.646 1 0.0346 0.217 0.000002 0.1070 0.6930 125.039 191867 4 5QGM1U0eCYrQuwSJwTm5Zq https://open.spotify.com/track/5QGM1U0eCYrQuwS...
2 2 Canta Canta, Minha Gente ... Martinho Da Vila 0.8510 0.73000 2 -11.048 1 0.3470 0.453 0.000063 0.1240 0.9050 93.698 152267 4 0NLIFSZxPzQhCwnkn5PJYs https://open.spotify.com/track/0NLIFSZxPzQhCwn...
3 3 Mulher Eu Sei ... Chico César 0.7050 0.05020 4 -18.115 1 0.0471 0.879 0.000041 0.3860 0.5240 106.802 186227 4 3mXqOdlLE1k67WsAxryPFs https://open.spotify.com/track/3mXqOdlLE1k67Ws...
4 4 Rosa Morena ... Kurt Elling 0.6510 0.11900 6 -19.807 1 0.0380 0.916 0.000343 0.1040 0.4020 120.941 273680 4 7bSzjzjTkWT2CkIPPdp0eA https://open.spotify.com/track/7bSzjzjTkWT2CkI...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5230 5230 1812 Festival Overture, Op. 49: 1812 Overture,... Pyotr Ilyich Tchaikovsky 0.2020 0.06940 3 -23.390 1 0.0473 0.917 0.715000 0.0832 0.0655 87.906 995000 4 1aEhYlZtypmipA06SDJ4U3 https://open.spotify.com/track/1aEhYlZtypmipA0...
5231 5231 Winter Fragments pour ensemble instrumental, s... Tristan Murail 0.1880 0.10100 6 -21.873 1 0.0442 0.855 0.426000 0.0694 0.0355 83.134 855000 4 1Gfqe7TAKklmuZf6hxsH6h https://open.spotify.com/track/1Gfqe7TAKklmuZf...
5232 5232 Schoenberg: 5 Orchestral Pieces, Op. 16: No. 3... Arnold Schoenberg 0.0596 0.00093 9 -42.959 1 0.0434 0.951 0.969000 0.0772 0.0344 71.573 238187 3 2XNwnFrdMDpismp0VUZ7cU https://open.spotify.com/track/2XNwnFrdMDpismp...
5233 5233 Serenade For Strings In E, Op.22, B. 52: 1. Mo... Antonín Dvořák 0.1330 0.02080 4 -29.443 1 0.0419 0.883 0.505000 0.1110 0.0591 67.109 314307 4 7ucDwgMtE3YJtEfTbuRhy0 https://open.spotify.com/track/7ucDwgMtE3YJtEf...
5234 5234 Ravel: Boléro, M. 81 ... Maurice Ravel 0.3540 0.04870 5 -24.568 1 0.0323 0.783 0.484000 0.2840 0.2530 64.120 948787 3 7E1ErYYCn0lYjHODZ1qGuB https://open.spotify.com/track/7E1ErYYCn0lYjHO...

5235 rows × 19 columns

In [2]:
spotify5k_df.axes
Out[2]:
[RangeIndex(start=0, stop=5235, step=1),
 Index(['Unnamed: 0', 'name', 'artist', 'danceability', 'energy', 'key',
        'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
        'liveness', 'valence', 'tempo', 'type', 'duration_ms', 'time_signature',
        'id', 'html'],
       dtype='object')]
In [3]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pandas as pd

features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

X = spotify5k_df[features]

wcss = []

k_range = range(1, 11)

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(10, 6))
plt.plot(k_range, wcss, marker='o', linestyle='-', color='b')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.title('Elbow Curve for KMeans Clustering')
plt.xticks(k_range)
plt.show()

wcss_data = pd.DataFrame({'Number of Clusters (k)': k_range, 'WCSS': wcss})
print("WCSS Data:")
wcss_data
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
WCSS Data:
Out[3]:
Number of Clusters (k) WCSS
0 1 4.481778e+06
1 2 1.744851e+06
2 3 8.550621e+05
3 4 5.981134e+05
4 5 4.404780e+05
5 6 3.650132e+05
6 7 3.213276e+05
7 8 2.811246e+05
8 9 2.543345e+05
9 10 2.315212e+05
In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, PowerTransformer

# Define the features
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

# Extract features from the dataframe (assuming spotify5k_df is defined elsewhere)
# X = spotify5k_df[features]

# Number of clusters
k = 4  

# List of scaler names for plotting and labeling
scaler_names = ['Raw', 'StandardScaler', 'MinMaxScaler', 'RobustScaler', 'MaxAbsScaler', 'PowerTransformer']

# Dictionary to store Within-Cluster-Sum-of-Squares (WCSS) for each scaler
wcss_dict = {}

# Define colors for better readability
colors = sns.color_palette("tab10")

# Dictionary to store cluster centers for each scaler
cluster_centers = {}

# Iterate over each scaler
for i, scaler in enumerate([None, StandardScaler(), MinMaxScaler(), RobustScaler(), MaxAbsScaler(), PowerTransformer()]):
    # Scale the features if scaler is not None
    if scaler is not None:
        X_scaled = scaler.fit_transform(X)
        scaler_name = scaler_names[i]
    else:
        X_scaled = X
        scaler_name = 'Raw Data'
    
    # Initialize KMeans clustering
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    
    # Calculate WCSS
    wcss = kmeans.inertia_
    wcss_dict[scaler_name] = wcss
    
    # Store cluster centers
    cluster_centers[scaler_name] = kmeans.cluster_centers_

    # Plot radar chart for cluster centers
    plt.figure(figsize=(10, 6))
    angles = np.linspace(0, 2 * np.pi, len(features), endpoint=False).tolist()
    angles += angles[:1]  # Ensure closed loop
    ax = plt.subplot(111, polar=True)

    # Plot each cluster's centroid
    for idx, center in enumerate(kmeans.cluster_centers_):
        values = np.round(np.concatenate((center, [center[0]])), 2)  # Round to two decimal points
        ax.plot(angles, values, marker='o', linestyle='-', color=colors[idx], linewidth=2, label=f'Cluster {idx+1}')
        # Fill the area enclosed by each cluster's centroid with a light blue shade
        ax.fill(angles, values, color=colors[idx], alpha=0.25)

    # Set the labels for each axis
    ax.set_yticklabels([])
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(features, fontsize=10)

    plt.title(f'Radar Chart for Cluster Centers ({scaler_name})', loc='left', fontsize=12, pad=20)
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=k)
    plt.show()

    # Create DataFrame to display WCSS for the current scaler
    scaler_df = pd.DataFrame(list(wcss_dict.items()), columns=['Scaler', 'WCSS'])

    # Display the table for the current scaler
    print(f"{scaler_name} chart")
    print(f"Table for {scaler_name}:")
    display(scaler_df)

# Find the best scaler based on the minimum WCSS
best_scaler = min(wcss_dict, key=wcss_dict.get)

print(f"\nBased on the Within-Cluster-Sum-of-Squares (WCSS), the best scaler to use is {best_scaler}. Using this scaler helps in minimizing the WCSS, indicating better cluster formation.")
print(f"\nChoosing an appropriate scaler is crucial as it affects the clustering results. With {k} clusters, it is recommended to use the {best_scaler} scaler to create the clusters.")

# Add explanation points based on the WCSS score
print("\nAdditional points: \n1) A lower WCSS score indicates that the data points within each cluster are closer to their respective centroids, implying more compact and well-separated clusters. \n2) By selecting the scaler that yields the lowest WCSS, we aim to achieve the most meaningful and distinct cluster separation. \n3) The shaded area in each radar chart represents the coverage or extent of each cluster's features in the scaled feature space. It visually demonstrates how different clusters vary in their feature composition and distribution.")
Raw Data chart
Table for Raw Data:
Scaler WCSS
0 Raw Data 598113.363521
StandardScaler chart
Table for StandardScaler:
Scaler WCSS
0 Raw Data 598113.363521
1 StandardScaler 24522.213055
MinMaxScaler chart
Table for MinMaxScaler:
Scaler WCSS
0 Raw Data 598113.363521
1 StandardScaler 24522.213055
2 MinMaxScaler 998.931204
RobustScaler chart
Table for RobustScaler:
Scaler WCSS
0 Raw Data 598113.363521
1 StandardScaler 24522.213055
2 MinMaxScaler 998.931204
3 RobustScaler 16081.887091
MaxAbsScaler chart
Table for MaxAbsScaler:
Scaler WCSS
0 Raw Data 598113.363521
1 StandardScaler 24522.213055
2 MinMaxScaler 998.931204
3 RobustScaler 16081.887091
4 MaxAbsScaler 999.866939
PowerTransformer chart
Table for PowerTransformer:
Scaler WCSS
0 Raw Data 598113.363521
1 StandardScaler 24522.213055
2 MinMaxScaler 998.931204
3 RobustScaler 16081.887091
4 MaxAbsScaler 999.866939
5 PowerTransformer 24975.059530
Based on the Within-Cluster-Sum-of-Squares (WCSS), the best scaler to use is MinMaxScaler. Using this scaler helps in minimizing the WCSS, indicating better cluster formation.

Choosing an appropriate scaler is crucial as it affects the clustering results. With 4 clusters, it is recommended to use the MinMaxScaler scaler to create the clusters.

Additional points: 
1) A lower WCSS score indicates that the data points within each cluster are closer to their respective centroids, implying more compact and well-separated clusters. 
2) By selecting the scaler that yields the lowest WCSS, we aim to achieve the most meaningful and distinct cluster separation. 
3) The shaded area in each radar chart represents the coverage or extent of each cluster's features in the scaled feature space. It visually demonstrates how different clusters vary in their feature composition and distribution.
In [5]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import pandas as pd

features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
X = spotify5k_df[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

wcss = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss, marker='o', linestyle='-')
plt.xlabel('Number of Clusters')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.show()

elbow_df = pd.DataFrame({'Number of Clusters': range(1, 11), 'WCSS': wcss})
elbow_df

silhouette_scores = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

plt.plot(range(2, 11), silhouette_scores, marker='o', linestyle='-')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score for Optimal Number of Clusters')
plt.show()

silhouette_df = pd.DataFrame({'Number of Clusters': range(2, 11), 'Silhouette Score': silhouette_scores})
silhouette_df
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
Out[5]:
Number of Clusters Silhouette Score
0 2 0.333466
1 3 0.270886
2 4 0.276123
3 5 0.267038
4 6 0.247665
5 7 0.215942
6 8 0.213132
7 9 0.196396
8 10 0.187944
In [6]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA()
X_pca = pca.fit_transform(X_scaled)
explained_variance = pca.explained_variance_ratio_
cumulative_explained_variance = explained_variance.cumsum()

plt.plot(range(1, len(explained_variance) + 1), cumulative_explained_variance, marker='o', linestyle='-')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Cumulative Explained Variance Ratio by Principal Components')
plt.grid(False)
plt.show()

num_components = 5
pca = PCA(n_components=num_components)
X_pca = pca.fit_transform(X_scaled)

num_clusters = 4 
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X_pca)
labels = kmeans.labels_

spotify5k_df['Cluster'] = labels
cluster_counts = spotify5k_df['Cluster'].value_counts()
print("Cluster Counts:\n", cluster_counts)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
Cluster Counts:
 Cluster
2    2602
3    1343
0     878
1     412
Name: count, dtype: int64
In [7]:
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(spotify5k_df[features])

kmeans = KMeans(n_clusters=4, random_state=42)
spotify5k_df['Cluster'] = kmeans.fit_predict(X)

numeric_columns = spotify5k_df.select_dtypes(include=['float64', 'int64'])
cluster_means = spotify5k_df.groupby('Cluster')[numeric_columns.columns].mean()
print("Cluster Characteristics (Mean Feature Values):\n")
cluster_means

import matplotlib.pyplot as plt

plt.scatter(X[:, 0], X[:, 1], c=spotify5k_df['Cluster'], cmap='viridis')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Clusters of Songs')
plt.colorbar(label='Cluster')
plt.show()

for feature in features:
    plt.figure(figsize=(8, 6))
    for cluster_id in range(4):
        cluster_data = spotify5k_df[spotify5k_df['Cluster'] == cluster_id][feature]
        plt.hist(cluster_data, bins=20, alpha=0.6, label=f'Cluster {cluster_id}')
    plt.title(f'Distribution of {feature} by Cluster')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
Cluster Characteristics (Mean Feature Values):

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, spotify5k_df['Cluster'], test_size=0.2, random_state=42)

knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)

y_pred = knn_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Accuracy: 0.9684813753581661
In [9]:
spotify5k_df.dtypes
Out[9]:
Unnamed: 0            int64
name                 object
artist               object
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
type                 object
duration_ms           int64
time_signature        int64
id                   object
html                 object
Cluster               int32
dtype: object
In [10]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Define a function to normalize tempo
def normalize_tempo(tempo):
    max_tempo = spotify5k_df['tempo'].max()
    min_tempo = spotify5k_df['tempo'].min()
    return (tempo - min_tempo) / (max_tempo - min_tempo)

# Apply the normalization function to the tempo column
spotify5k_df['normalized_tempo'] = spotify5k_df['tempo'].apply(normalize_tempo)

cluster_names = {
    0: 'Serene Sounds',
    1: 'Pulsating Rhythms',
    2: 'Tranquil Tunes',
    3: 'Melancholic Melodies',
}

# Use 'normalized_tempo' instead of 'tempo' in cluster statistics
cluster_statistics = spotify5k_df.groupby('Cluster').agg({
    'danceability': 'mean',
    'energy': 'mean',
    'valence': 'mean',
    'normalized_tempo': 'mean',  # Use 'normalized_tempo' instead of 'tempo'
    'acousticness': 'mean',
    'speechiness': 'mean'
})

cluster_explanations = {
    0: f"Songs with serene and calming vibes characterized by high valence ({cluster_statistics.loc[0, 'valence']:.2f}) and moderate tempo ({cluster_statistics.loc[0, 'normalized_tempo']:.2f}).",
    1: f"Tracks featuring pulsating rhythms and high energy suitable for dancing with high energy ({cluster_statistics.loc[1, 'energy']:.2f}) and tempo ({cluster_statistics.loc[1, 'normalized_tempo']:.2f}).",
    2: f"Music with tranquil melodies and moderate energy levels, perfect for unwinding with a balanced mix of valence ({cluster_statistics.loc[2, 'valence']:.2f}) and tempo ({cluster_statistics.loc[2, 'normalized_tempo']:.2f}).",
    3: f"Melancholic tunes with low valence ({cluster_statistics.loc[3, 'valence']:.2f}) and a somber atmosphere, often featuring high acousticness ({cluster_statistics.loc[3, 'acousticness']:.2f}).",
}

for cluster_id, name in cluster_names.items():
    print(f"Cluster {cluster_id} ({name}): {cluster_explanations[cluster_id]}")
    display(cluster_statistics.loc[[cluster_id]])
    
    plt.figure(figsize=(10, 6))
    sns.set(style="white")  # Remove grid
    sns.barplot(x=cluster_statistics.columns, y=cluster_statistics.loc[cluster_id].values, palette="magma")
    plt.title(f'Cluster {cluster_id} - {name} Features')
    plt.xticks(rotation=45)
    
    for index, value in enumerate(cluster_statistics.loc[cluster_id]):
        plt.text(index, value, f'{value:.2f}', ha='center', va='bottom')
    
    plt.show()
    print('\n')
Cluster 0 (Serene Sounds): Songs with serene and calming vibes characterized by high valence (0.61) and moderate tempo (0.56).
danceability energy valence normalized_tempo acousticness speechiness
Cluster
0 0.645645 0.684244 0.611999 0.564491 0.236267 0.056894

Cluster 1 (Pulsating Rhythms): Tracks featuring pulsating rhythms and high energy suitable for dancing with high energy (0.15) and tempo (0.49).
danceability energy valence normalized_tempo acousticness speechiness
Cluster
1 0.354487 0.146981 0.2049 0.491445 0.913782 0.045184

Cluster 2 (Tranquil Tunes): Music with tranquil melodies and moderate energy levels, perfect for unwinding with a balanced mix of valence (0.58) and tempo (0.58).
danceability energy valence normalized_tempo acousticness speechiness
Cluster
2 0.683221 0.738124 0.579874 0.577067 0.17778 0.267501

Cluster 3 (Melancholic Melodies): Melancholic tunes with low valence (0.23) and a somber atmosphere, often featuring high acousticness (0.01).
danceability energy valence normalized_tempo acousticness speechiness
Cluster
3 0.288293 0.904954 0.225855 0.57095 0.014879 0.103418

In [11]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 10))

for cluster_id, name in cluster_names.items():
    plt.scatter(cluster_statistics.loc[cluster_id, 'normalized_tempo'], 
                cluster_statistics.loc[cluster_id, 'valence'], 
                label=name, 
                s=200, 
                alpha=0.7)  
    
    plt.text(cluster_statistics.loc[cluster_id, 'normalized_tempo'], 
             cluster_statistics.loc[cluster_id, 'valence'], 
             f"{name}\n{cluster_explanations[cluster_id]}", 
             fontsize=10, 
             ha='center', 
             va='center', 
             wrap=True)  

plt.xlabel('Normalized Tempo', fontsize=12)
plt.ylabel('Valence', fontsize=12)
plt.title('Cluster Analysis based on Tempo and Valence', fontsize=14)
plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.2), ncol=2, fontsize=10)
plt.grid(False)

plt.margins(0.05)
plt.show()

# Explanation
print("\nExplanation:")
print("We chose to plot tempo and valence as they are two key features that determine the mood of a song.")
print("Tempo indicates the speed or pace of the music, while valence represents the positivity or negativity of the musical content.")
print("By analyzing these two features, we are able to identify distinct clusters representing songs with different mood characteristics.")

# Conclusion
print("\nConclusion:")
print("Based on the clustering analysis, we identified distinct clusters representing songs with different mood characteristics.")
print("Machine learning can be a valuable tool for creating playlists as it automatically categorizes songs based on their features, helping users discover music that matches their mood and preferences.")
Explanation:
We chose to plot tempo and valence as they are two key features that determine the mood of a song.
Tempo indicates the speed or pace of the music, while valence represents the positivity or negativity of the musical content.
By analyzing these two features, we are able to identify distinct clusters representing songs with different mood characteristics.

Conclusion:
Based on the clustering analysis, we identified distinct clusters representing songs with different mood characteristics.
Machine learning can be a valuable tool for creating playlists as it automatically categorizes songs based on their features, helping users discover music that matches their mood and preferences.
In [12]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

spotify5k_df['text'] = spotify5k_df['name'] + ' ' + spotify5k_df['artist']

vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(spotify5k_df['text'])

vocab = vectorizer.get_feature_names_out()

word_counts = pd.DataFrame(X.toarray(), columns=vocab)
word_counts['Cluster'] = spotify5k_df['Cluster']

cluster_names = {
    0: 'Relaxing Vibes',
    1: 'Energetic Beats',
    2: 'Chill Out',
    3: 'Melancholic Melodies'
}

for cluster_id in range(len(word_counts['Cluster'].unique())):
    words_in_cluster = word_counts[word_counts['Cluster'] == cluster_id].drop('Cluster', axis=1)
    word_freq = words_in_cluster.sum().to_dict()

    wordcloud = WordCloud(width=800, height=400, background_color='white', prefer_horizontal=0.9).generate_from_frequencies(word_freq)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for {cluster_names.get(cluster_id, "Cluster " + str(cluster_id))}')
    plt.axis('off')
    plt.show()

    top_words = words_in_cluster.sum().sort_values(ascending=False).head(10)
    sns.set_palette('bright')
    plt.figure(figsize=(10, 6))
    sns.barplot(x=top_words.values, y=top_words.index)
    plt.title(f'Top 10 Words in {cluster_names.get(cluster_id, "Cluster " + str(cluster_id))}')
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['left'].set_visible(False)
    plt.gca().spines['bottom'].set_visible(False)
    for i, (word, count) in enumerate(zip(top_words.index, top_words.values)):
        plt.text(count, i, f' {word} ({count})', fontsize=10, style='italic', va='center')
    plt.show()
In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import pdist
from warnings import simplefilter

# Ignore future warnings
simplefilter(action='ignore', category=FutureWarning)

# Select features for clustering
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 
            'instrumentalness', 'liveness', 'valence', 'tempo']

# Scale the features
try:
    scaler = StandardScaler()
    X = scaler.fit_transform(spotify5k_df[features])
except KeyError:
    print("Error: Features not found in the dataset.")

# Use PCA to determine the number of clusters
try:
    pca = PCA(n_components=len(features))
    X_pca = pca.fit_transform(X)
    cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
    n_components = np.argmax(cumulative_variance_ratio >= 0.95) + 1
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
except ValueError:
    print("Error: Not enough features for PCA.")

# Find optimal number of clusters using silhouette score
best_score = -1
best_k = -1
for k in range(2, 11):
    try:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(X_pca)
        silhouette_avg = silhouette_score(X_pca, kmeans.labels_)
        if silhouette_avg > best_score:
            best_score = silhouette_avg
            best_k = k
    except ValueError:
        print("Error: Unable to find optimal number of clusters.")

# Train KMeans with optimal number of clusters
try:
    kmeans = KMeans(n_clusters=best_k, random_state=42)
    kmeans.fit(X_pca)
    spotify5k_df['cluster'] = kmeans.labels_
except ValueError:
    print("Error: Unable to train KMeans model.")

# Function to calculate diversity of recommended songs
def calculate_diversity(recommended_songs):
    try:
        recommended_features = recommended_songs[features].to_numpy()
        cosine_distances = pdist(recommended_features, metric='cosine')
        avg_cosine_distance = np.mean(cosine_distances)
        diversity = 1 - avg_cosine_distance
        return diversity
    except KeyError:
        print("Error: Features not found in recommended songs.")

# Function to recommend songs from a given cluster
def recommend_songs(cluster_id, num_songs=5):
    try:
        cluster_data = spotify5k_df[spotify5k_df['cluster'] == cluster_id]
        recommended_songs = cluster_data.sample(min(num_songs, len(cluster_data)))
        return recommended_songs
    except KeyError:
        print("Error: Cluster ID not found.")

# Visualize clusters
def visualize_clusters(X_pca, labels, centroids):
    plt.figure(figsize=(10, 6))
    for i in range(len(np.unique(labels))):
        plt.scatter(X_pca[labels == i, 0], X_pca[labels == i, 1], label=f'Cluster {i}')
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=200, c='black', label='Centroids')
    plt.title('Clusters')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.legend()
    plt.show()

# Visualize explained variance ratio
def visualize_variance(pca):
    plt.figure(figsize=(8, 6))
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.title('PCA Explained Variance Ratio')
    plt.grid(True)
    plt.show()

# Visualize silhouette scores
def visualize_silhouette_scores(scores):
    plt.figure(figsize=(8, 6))
    plt.plot(range(2, 11), scores, marker='o')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Score vs Number of Clusters')
    plt.grid(True)
    plt.show()

# Example: Recommend songs from each cluster and calculate diversity
try:
    print("**Playlist Recommendations and Diversity Scores:**")
    playlists = []
    for cluster_id in range(best_k):
        recommended_songs = recommend_songs(cluster_id)
        playlist_name = f"Playlist {cluster_id + 1}: {', '.join(recommended_songs['name'].tolist())}"
        playlists.append({'Name': playlist_name, 'Diversity': calculate_diversity(recommended_songs)})
    playlists_df = pd.DataFrame(playlists)
    display(playlists_df)

except TypeError:
    print("Error: Unable to calculate diversity.")

# Visualizations
print("\n**Visualizations:**")
visualize_clusters(X_pca, kmeans.labels_, kmeans.cluster_centers_)
visualize_variance(pca)
visualize_silhouette_scores([silhouette_score(X_pca, kmeans.labels_) for k in range(2, 11)])

# Answers to questions
print("\n**Answers to Questions:**")
print("\n**How did you create your prototype?**")
print("The prototype was created using Python with the scikit-learn library for machine learning algorithms.\n")

print("**How many playlists (clusters) are there?**")
print("The number of playlists (clusters) is determined dynamically based on the data using the silhouette score.\n")

print("**What audio features did you use and what did you drop? Why?**")
print("We used features like danceability, energy, loudness, etc., as they are relevant to song characteristics.")
print("We dropped features that were not considered to significantly influence playlist creation, such as 'duration_ms' or 'time_signature'.\n")

print("**Is the prototype effective at creating cohesive playlists?**")
print("Cohesiveness of playlists can be evaluated based on diversity and user feedback.\n")

print("**Are Spotify’s audio features capable of identifying 'similar songs' as defined by humanly detectable criteria?**")
print("This can be determined through user feedback and comparison with manually curated playlists.\n")

print("**What kind of data might help us create better playlists?**")
print("Additional data such as user preferences, listening history, genre information, etc., can improve playlist quality.\n")

print("**Is K-Means a good method for creating playlists? Provide pros and cons.**")
print("Pros:")
print("- Simple and easy to implement.")
print("- Scalable to large datasets.")
print("\nCons:")
print("- Assumes clusters are spherical and of equal size.")
print("- Sensitive to initialization.\n")

print("**What would be your next steps if you continued with this project?**")
print("Further refinement of clustering algorithms, incorporation of user feedback, and integration with a music streaming platform for real-time playlist generation.")
**Playlist Recommendations and Diversity Scores:**
Name Diversity
0 Playlist 1: Spawn of Flesh ... 0.999342
1 Playlist 2: Ui! (Voce Inventa) ... 0.991936
**Visualizations:**
**Answers to Questions:**

**How did you create your prototype?**
The prototype was created using Python with the scikit-learn library for machine learning algorithms.

**How many playlists (clusters) are there?**
The number of playlists (clusters) is determined dynamically based on the data using the silhouette score.

**What audio features did you use and what did you drop? Why?**
We used features like danceability, energy, loudness, etc., as they are relevant to song characteristics.
We dropped features that were not considered to significantly influence playlist creation, such as 'duration_ms' or 'time_signature'.

**Is the prototype effective at creating cohesive playlists?**
Cohesiveness of playlists can be evaluated based on diversity and user feedback.

**Are Spotify’s audio features capable of identifying 'similar songs' as defined by humanly detectable criteria?**
This can be determined through user feedback and comparison with manually curated playlists.

**What kind of data might help us create better playlists?**
Additional data such as user preferences, listening history, genre information, etc., can improve playlist quality.

**Is K-Means a good method for creating playlists? Provide pros and cons.**
Pros:
- Simple and easy to implement.
- Scalable to large datasets.

Cons:
- Assumes clusters are spherical and of equal size.
- Sensitive to initialization.

**What would be your next steps if you continued with this project?**
Further refinement of clustering algorithms, incorporation of user feedback, and integration with a music streaming platform for real-time playlist generation.
In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from IPython.display import display

# Initialize the VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Perform sentiment analysis on the 'name' column and handle NaNs
spotify5k_df['sentiment_score'] = spotify5k_df['name'].fillna('').apply(lambda x: sia.polarity_scores(x)['compound'])

# Fit a TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(spotify5k_df['name'])

# Find optimal number of clusters using silhouette score
silhouette_scores = []
for n_clusters in range(2, 11):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(X)
    silhouette_scores.append(silhouette_score(X, cluster_labels))

optimal_clusters_silhouette = silhouette_scores.index(max(silhouette_scores)) + 2

# Apply PCA to visualize clusters
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X.toarray())

# Find optimal number of clusters using PCA
inertia = []
for n_clusters in range(2, 11):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X_pca)
    inertia.append(kmeans.inertia_)

optimal_clusters_pca = inertia.index(min(inertia)) + 2

# Cluster using KMeans with optimal number of clusters
kmeans_silhouette = KMeans(n_clusters=optimal_clusters_silhouette, random_state=42)
spotify5k_df['cluster_silhouette'] = kmeans_silhouette.fit_predict(X)

kmeans_pca = KMeans(n_clusters=optimal_clusters_pca, random_state=42)
spotify5k_df['cluster_pca'] = kmeans_pca.fit_predict(X)

# Define cluster names based on sentiment
sentiment_cluster_names = {
    0: 'Negative (Low Sentiment)',
    1: 'Neutral (Medium Sentiment)',
    2: 'Positive (High Sentiment)'
}

# Assign cluster names
spotify5k_df['cluster_silhouette'] = spotify5k_df['cluster_silhouette'].map(sentiment_cluster_names)
spotify5k_df['cluster_pca'] = spotify5k_df['cluster_pca'].map(sentiment_cluster_names)

# Explanation for choosing the number of clusters
explanation = f"The number of clusters chosen based on silhouette score: {optimal_clusters_silhouette}. " \
              f"The number of clusters chosen based on PCA: {optimal_clusters_pca}."

# Visualize clusters using PCA
plt.figure(figsize=(12, 8))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=spotify5k_df['cluster_pca'], palette='husl', legend='full', marker='o')
plt.title('PCA Visualization of Song Clusters')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.2), ncol=3)
plt.grid(False)
plt.show()

# Visualize clusters using Silhouette Score
plt.figure(figsize=(10, 6))
sns.lineplot(x=range(2, 11), y=silhouette_scores, marker='o', color='blue')
plt.title('Silhouette Score for Optimal Cluster Selection')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.grid(False)
plt.xticks(range(2, 11))
plt.show()

# Additional visualizations
plt.figure(figsize=(10, 6))
sns.histplot(data=spotify5k_df, x='sentiment_score', bins=30, kde=True, color='green')
plt.title('Distribution of Sentiment Scores')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.grid(False)
plt.show()

plt.figure(figsize=(10, 6))
sns.countplot(data=spotify5k_df, x='cluster_pca', palette='husl')
plt.title('Number of Songs in Each Sentiment Cluster')
plt.xlabel('Sentiment Cluster')
plt.ylabel('Count')
plt.grid(False)
plt.show()

# Output DataFrame with cluster assignments and remove NaNs
cluster_silhouette_df = spotify5k_df[['name', 'cluster_silhouette']].dropna()
cluster_pca_df = spotify5k_df[['name', 'cluster_pca']].dropna()

# Output sentiment score for each cluster
sentiment_scores = spotify5k_df.groupby('cluster_pca')['sentiment_score'].mean()

# Explanation for cluster names
cluster_name_explanation = f"Cluster names are based on sentiment score: " \
                           f"Negative (Low Sentiment): sentiment score < 0, " \
                           f"Neutral (Medium Sentiment): sentiment score ≈ 0, " \
                           f"Positive (High Sentiment): sentiment score > 0."

# Display the outputs
print(explanation)
print(cluster_name_explanation)
display(cluster_silhouette_df)
display(cluster_pca_df)
display(sentiment_scores)
The number of clusters chosen based on silhouette score: 10. The number of clusters chosen based on PCA: 10.
Cluster names are based on sentiment score: Negative (Low Sentiment): sentiment score < 0, Neutral (Medium Sentiment): sentiment score ≈ 0, Positive (High Sentiment): sentiment score > 0.
name cluster_silhouette
10 The Girl From Ipanema ... Positive (High Sentiment)
18 Aquarela Do Brasil ... Negative (Low Sentiment)
21 Aquarela Do Brasil ... Negative (Low Sentiment)
23 Don't Stop the Carnival ... Positive (High Sentiment)
28 Aquarela Do Brasil ... Negative (Low Sentiment)
... ... ...
5197 The Unanswered Question ... Positive (High Sentiment)
5201 The Planets - Suite for large orchestra, Op.32... Positive (High Sentiment)
5213 The Nutcracker, Op.71, TH.14 / Act 1: No. 2 Ma... Positive (High Sentiment)
5226 Pines Of Rome, P. 141: 3. The Pines Of The Jan... Positive (High Sentiment)
5229 A Flock Descends Into The Pentagonal Garden ... Positive (High Sentiment)

621 rows × 2 columns

name cluster_pca
10 The Girl From Ipanema ... Positive (High Sentiment)
18 Aquarela Do Brasil ... Negative (Low Sentiment)
21 Aquarela Do Brasil ... Negative (Low Sentiment)
23 Don't Stop the Carnival ... Positive (High Sentiment)
28 Aquarela Do Brasil ... Negative (Low Sentiment)
... ... ...
5197 The Unanswered Question ... Positive (High Sentiment)
5201 The Planets - Suite for large orchestra, Op.32... Positive (High Sentiment)
5213 The Nutcracker, Op.71, TH.14 / Act 1: No. 2 Ma... Positive (High Sentiment)
5226 Pines Of Rome, P. 141: 3. The Pines Of The Jan... Positive (High Sentiment)
5229 A Flock Descends Into The Pentagonal Garden ... Positive (High Sentiment)

621 rows × 2 columns

cluster_pca
Negative (Low Sentiment)      0.114416
Neutral (Medium Sentiment)    0.072395
Positive (High Sentiment)    -0.044409
Name: sentiment_score, dtype: float64
In [15]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from IPython.display import display

# Initialize the VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Perform sentiment analysis on the objective columns and handle NaNs
objective_columns = ['danceability', 'energy', 'key', 'loudness', 'mode', 
                     'speechiness', 'acousticness', 'instrumentalness', 
                     'liveness', 'valence', 'tempo']

spotify5k_df['objective_sentiment_score'] = spotify5k_df[objective_columns].fillna('').apply(
    lambda x: sia.polarity_scores(str(x))['compound'])

# Fit a PCA to visualize objective clusters
X_objective = spotify5k_df[objective_columns].fillna(0)  # Fill NaNs with 0 for PCA
pca_objective = PCA(n_components=2, random_state=42)
X_pca_objective = pca_objective.fit_transform(X_objective)

# Cluster using KMeans with optimal number of clusters
kmeans_objective = KMeans(n_clusters=3, random_state=42)
spotify5k_df['objective_cluster'] = kmeans_objective.fit_predict(X_objective)

# Define cluster names based on objective sentiment
objective_cluster_names = {
    0: 'Low',
    1: 'Medium',
    2: 'High'
}

# Assign cluster names
spotify5k_df['objective_cluster'] = spotify5k_df['objective_cluster'].map(objective_cluster_names)

# Explanation for objective sentiment analysis
objective_explanation = "Objective sentiment analysis was performed based on the following columns: " \
                        "'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', " \
                        "'acousticness', 'instrumentalness', 'liveness', 'valence', and 'tempo'. " \
                        "Three clusters were chosen for better interpretation: Low, Medium, and High."

# Visualize objective clusters using PCA
plt.figure(figsize=(12, 8))
sns.scatterplot(x=X_pca_objective[:, 0], y=X_pca_objective[:, 1], 
                hue=spotify5k_df['objective_cluster'], palette='husl', 
                legend='full', marker='o')
plt.title('PCA Visualization of Objective Sentiment Clusters')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.2), ncol=3)
plt.grid(False)
plt.show()

# Compare with sentiment analysis based on 'name' column
plt.figure(figsize=(10, 6))
sns.countplot(data=spotify5k_df, x='cluster_pca', hue='objective_cluster', palette='husl')
plt.title('Comparison of Sentiment Clusters (Name vs Objective Columns)')
plt.xlabel('Sentiment Cluster (Name)')
plt.ylabel('Count')
plt.legend(title='Objective Cluster', loc='upper right')
plt.grid(False)
plt.show()

# Display the outputs
display(objective_explanation)
display(spotify5k_df[['name', 'objective_cluster']])
"Objective sentiment analysis was performed based on the following columns: 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', and 'tempo'. Three clusters were chosen for better interpretation: Low, Medium, and High."
name objective_cluster
0 Se Eu Quiser Falar Com Deus ... Medium
1 Saudade De Bahia ... Medium
2 Canta Canta, Minha Gente ... Low
3 Mulher Eu Sei ... Low
4 Rosa Morena ... Medium
... ... ...
5230 1812 Festival Overture, Op. 49: 1812 Overture,... Low
5231 Winter Fragments pour ensemble instrumental, s... Low
5232 Schoenberg: 5 Orchestral Pieces, Op. 16: No. 3... Low
5233 Serenade For Strings In E, Op.22, B. 52: 1. Mo... Low
5234 Ravel: Boléro, M. 81 ... Low

5235 rows × 2 columns

In [16]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Select features for clustering
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
            'instrumentalness', 'liveness', 'valence', 'tempo']

# Explanation: These features are chosen because they represent different aspects of songs
print("Selected Features for Clustering:")
print(pd.DataFrame(features, columns=['Features']))

# Scale the features using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(spotify5k_df[features])

# Explanation: StandardScaler is used to standardize the features, ensuring each feature has a mean of 0 and a standard deviation of 1.
print("\nFeature Scaling:")
print("The features are scaled using StandardScaler to standardize the data, making it suitable for clustering.")

# Create a DataFrame of scaled features for visualization
scaled_df = pd.DataFrame(X_scaled, columns=features)

# Plot histograms of scaled features
plt.figure(figsize=(12, 8))
for i, feature in enumerate(features):
    plt.subplot(3, 3, i + 1)
    plt.hist(scaled_df[feature], bins=20, color='skyblue', edgecolor='black')
    plt.title(feature)
    plt.xlabel('Scaled Values')
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# Use PCA to reduce dimensionality
pca = PCA(n_components=len(features))
X_pca = pca.fit_transform(X_scaled)

# Explanation: PCA is used to reduce the dimensionality of the data while retaining most of its variance.
print("\nDimensionality Reduction with PCA:")
print("Principal Component Analysis (PCA) is applied to reduce the dimensionality of the data while retaining most of its variance.")

# Plot explained variance ratio
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(features) + 1), pca.explained_variance_ratio_, marker='o', linestyle='--', color='b')
plt.title('Explained Variance Ratio by Principal Components')
plt.xlabel('Number of Principal Components')
plt.ylabel('Explained Variance Ratio')
plt.xticks(np.arange(1, len(features) + 1))
plt.grid(False)
plt.show()

# Find optimal number of clusters using silhouette score
silhouette_scores = []
for k in range(2, 101):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_pca)
    silhouette_avg = silhouette_score(X_pca, kmeans.labels_)
    silhouette_scores.append(silhouette_avg)

# Plot silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(range(2, 101), silhouette_scores, marker='o', linestyle='-', color='r')
plt.title('Silhouette Score for Different Numbers of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.xticks(np.arange(2, 101, 5))
plt.grid(False)
plt.show()

# Based on business requirements, choose the number of clusters
# We aim to create playlists with sizes between 50 and 250 songs
# So, let's explore having between 20 and 100 clusters
print("\nChoosing Number of Clusters:")
print("Based on business requirements, we aim to create playlists with sizes between 50 and 250 songs.")
print("We will explore creating between 20 and 100 clusters to ensure playlist sizes between 50 and 250 songs.")
Selected Features for Clustering:
           Features
0      danceability
1            energy
2          loudness
3       speechiness
4      acousticness
5  instrumentalness
6          liveness
7           valence
8             tempo

Feature Scaling:
The features are scaled using StandardScaler to standardize the data, making it suitable for clustering.
Dimensionality Reduction with PCA:
Principal Component Analysis (PCA) is applied to reduce the dimensionality of the data while retaining most of its variance.
Choosing Number of Clusters:
Based on business requirements, we aim to create playlists with sizes between 50 and 250 songs.
We will explore creating between 20 and 100 clusters to ensure playlist sizes between 50 and 250 songs.
In [17]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Step 1: Determine the optimal number of clusters
# Calculate silhouette scores for different numbers of clusters
silhouette_scores = []
for k in range(2, 101):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_pca)
    silhouette_avg = silhouette_score(X_pca, kmeans.labels_)
    silhouette_scores.append(silhouette_avg)

# Plot silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(range(2, 101), silhouette_scores, marker='o', linestyle='-', color='limegreen')
plt.title('Silhouette Score for Different Numbers of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.xticks(np.arange(2, 101, 5))
plt.grid(False)
plt.show()

# Step 2: Perform K-means clustering with the chosen number of clusters
# Based on business requirements, let's choose the number of clusters
# We aim to create playlists with sizes between 50 and 250 songs
# So, let's explore having between 20 and 100 clusters
chosen_clusters = 50

# Perform K-means clustering
kmeans = KMeans(n_clusters=chosen_clusters, random_state=42)
kmeans.fit(X_pca)

# Assign songs to clusters
cluster_labels = kmeans.labels_

# Step 3: Analyze characteristics of each cluster and create playlists
# Visualize cluster sizes
plt.figure(figsize=(10, 6))
plt.hist(cluster_labels, bins=chosen_clusters, color='gold', edgecolor='black')
plt.title('Distribution of Songs Across Clusters')
plt.xlabel('Cluster')
plt.ylabel('Number of Songs')
plt.grid(False)
plt.show()

# Analyze characteristics of each cluster
cluster_centers = scaler.inverse_transform(pca.inverse_transform(kmeans.cluster_centers_))
cluster_df = pd.DataFrame(cluster_centers, columns=features)

# Display cluster characteristics
print("\nCluster Characteristics:")
print(cluster_df)

# Create playlists based on clusters
playlist_sizes = [50, 100, 150, 200, 250]
playlists = []

for size in playlist_sizes:
    playlist = []
    for i in range(chosen_clusters):
        cluster_indices = np.where(cluster_labels == i)[0]
        cluster_indices = np.random.choice(cluster_indices, min(size // chosen_clusters, len(cluster_indices)), replace=False)
        playlist.extend(cluster_indices)
    playlists.append(playlist)

# Visualize playlist sizes
plt.figure(figsize=(10, 6))
plt.bar(range(len(playlist_sizes)), [len(p) for p in playlists], color='skyblue', edgecolor='black')
plt.xticks(range(len(playlist_sizes)), [f"{size} Songs" for size in playlist_sizes])
plt.title('Playlist Sizes')
plt.xlabel('Playlist Size')
plt.ylabel('Number of Songs')
plt.grid(False)
plt.show()

# Final Output: A data-driven analysis on clustering music data to create playlists
print("\nData-Driven Playlist Creation:")
print("By applying K-means clustering to music data, we identified distinct clusters of songs.")
print("Each cluster represents songs with similar characteristics, allowing us to create diverse playlists.")
print("Our analysis ensures that each playlist falls within the desired size range, catering to various music preferences.")
Cluster Characteristics:
    danceability    energy   loudness  speechiness  acousticness  \
0       0.638241  0.543509  -9.420741     0.049454      0.217128   
1       0.255956  0.953867  -6.552889     0.294956      0.018788   
2       0.237559  0.134845 -20.194955     0.042655      0.918593   
3       0.384051  0.784051  -5.999276     0.060602      0.065965   
4       0.255372  0.918642  -8.244533     0.100722      0.009430   
5       0.722199  0.822593  -5.170907     0.054779      0.084628   
6       0.530379  0.585765  -7.275750     0.038811      0.118160   
7       0.512322  0.189918 -14.514575     0.042064      0.862069   
8       0.611485  0.796667  -6.480545     0.260727      0.164447   
9       0.236984  0.953076  -6.970414     0.138273      0.004265   
10      0.776301  0.662301  -6.842019     0.290340      0.195811   
11      0.571887  0.505642 -10.295491     0.060089      0.609679   
12      0.688940  0.796894  -6.125384     0.051253      0.148324   
13      0.649836  0.729164  -5.785616     0.197658      0.223944   
14      0.261514  0.158233 -19.581289     0.044287      0.927461   
15      0.287964  0.944371  -6.620429     0.117826      0.007488   
16      0.548741  0.614519  -8.860247     0.052907      0.608185   
17      0.422893  0.152813 -17.881270     0.044634      0.935393   
18      0.522704  0.494800  -8.605872     0.037519      0.607096   
19      0.256256  0.899957  -7.829686     0.084520      0.022247   
20      0.517682  0.815305  -6.025156     0.051307      0.079856   
21      0.276156  0.947701  -6.949583     0.130681      0.005310   
22      0.312911  0.050768 -24.245583     0.048975      0.932583   
23      0.729367  0.652751  -6.501610     0.050899      0.102056   
24      0.568820  0.809036  -5.338006     0.057978      0.078530   
25      0.789089  0.754093  -5.462509     0.073350      0.157751   
26      0.767610  0.787052  -5.132234     0.171831      0.174536   
27      0.646381  0.615952  -7.402762     0.462476      0.204789   
28      0.189143  0.923218  -8.187103     0.114483      0.000162   
29      0.326286  0.176659 -21.674727     0.056445      0.926273   
30      0.645920  0.284357 -14.288159     0.056819      0.788129   
31      0.653902  0.835561  -5.648622     0.064650      0.053876   
32      0.256405  0.021973 -32.966220     0.047760      0.963209   
33      0.569960  0.752840  -5.628180     0.355360      0.128330   
34      0.260148  0.872615  -9.842303     0.080007      0.005975   
35      0.321013  0.203286 -14.384658     0.042971      0.911763   
36      0.617460  0.784080  -7.638440     0.051896      0.099935   
37      0.674254  0.611690  -8.368063     0.054069      0.655135   
38      0.595532  0.806073  -5.977532     0.067370      0.122243   
39      0.277600  0.835827 -10.160320     0.070545      0.014217   
40      0.325106  0.046042 -28.753576     0.046194      0.966515   
41      0.551570  0.410289 -10.231307     0.043446      0.640237   
42      0.550944  0.861622  -5.162889     0.072799      0.066933   
43      0.699763  0.753553  -5.486289     0.057467      0.108573   
44      0.272695  0.927924  -6.761848     0.108619      0.014219   
45      0.599190  0.345586 -14.190603     0.048495      0.811655   
46      0.735399  0.536135 -10.632020     0.046663      0.192130   
47      0.762082  0.725300  -5.641491     0.184091      0.190541   
48      0.325622  0.265924 -14.424222     0.046704      0.779556   
49      0.601290  0.846842  -4.467612     0.062983      0.049764   

    instrumentalness  liveness   valence       tempo  
0           0.012862  0.119604  0.610937   91.825902  
1           0.391098  0.157518  0.089462  115.284311  
2           0.857511  0.125135  0.103664   77.182169  
3           0.057124  0.170864  0.418495  170.613582  
4           0.844095  0.350912  0.204377  111.275489  
5           0.017626  0.100758  0.812671  127.091134  
6           0.018348  0.123937  0.224861  116.722008  
7           0.043116  0.126871  0.268154  122.666184  
8           0.027714  0.708424  0.510800  126.642636  
9           0.787025  0.128012  0.143622  100.997127  
10          0.001562  0.139392  0.662738   98.252214  
11          0.022195  0.706925  0.617623  112.332472  
12          0.010750  0.366053  0.801212  120.065166  
13          0.016701  0.148638  0.656507  167.550219  
14          0.846895  0.135797  0.154867  158.944316  
15          0.090382  0.116717  0.190138   99.038614  
16          0.025143  0.147107  0.826025  159.414086  
17          0.871393  0.118960  0.160820  115.211041  
18          0.018188  0.146095  0.423475  132.683832  
19          0.731383  0.380600  0.310424  160.550414  
20          0.043056  0.114947  0.558883  108.087292  
21          0.100312  0.165424  0.168013  143.009142  
22          0.071856  0.136039  0.163064  101.235722  
23          0.017560  0.119760  0.489910  124.812768  
24          0.032863  0.350497  0.390683  130.595293  
25          0.004165  0.107227  0.772065   97.149201  
26          0.020615  0.321896  0.665247  103.065013  
27          0.006536  0.257362  0.502857  110.747524  
28          0.807092  0.123032  0.201592  168.943759  
29          0.497099  0.697045  0.223895  104.050136  
30          0.058963  0.138916  0.624248  111.711876  
31          0.749402  0.165323  0.291985  126.448341  
32          0.874901  0.100595  0.112177   73.572308  
33          0.018522  0.172696  0.531614  176.514880  
34          0.816795  0.128945  0.259733  135.793615  
35          0.042932  0.173722  0.199078   80.435184  
36          0.735660  0.168884  0.805620  136.681920  
37          0.026929  0.164922  0.820841  105.576421  
38          0.022642  0.133181  0.779725  166.843037  
39          0.780080  0.148558  0.317591   97.050187  
40          0.898045  0.106336  0.182614  134.855894  
41          0.025229  0.139161  0.405124   89.198860  
42          0.125006  0.709822  0.458938  123.790489  
43          0.008560  0.289750  0.478816   96.137842  
44          0.166406  0.341876  0.236993  105.341238  
45          0.823052  0.141048  0.537431  110.537931  
46          0.014568  0.106582  0.848595  119.276284  
47          0.004509  0.103830  0.610709   99.563218  
48          0.076807  0.162613  0.327256  175.201756  
49          0.032006  0.121711  0.307443  129.230525  
Data-Driven Playlist Creation:
By applying K-means clustering to music data, we identified distinct clusters of songs.
Each cluster represents songs with similar characteristics, allowing us to create diverse playlists.
Our analysis ensures that each playlist falls within the desired size range, catering to various music preferences.
In [32]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Selecting columns for clustering
columns_for_clustering = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 
                          'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 
                          'duration_ms']

# Scaling the selected columns
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(spotify5k_df[columns_for_clustering])

# Performing K-means clustering with 50 clusters
kmeans = KMeans(n_clusters=50, random_state=42)
spotify5k_df['playlist_cluster'] = kmeans.fit_predict(scaled_features)

# Creating a mapping dictionary to map cluster labels to playlist names
playlist_names = [
    "Relaxing Vibes", "Energetic Workout", "Chill Lounge", "Happy Beats", "Soothing Melodies",
    "Motivational Mix", "Groovy Tunes", "Nighttime Chill", "Summer Fun", "Cozy Fireplace",
    "Mellow Grooves", "High Tempo", "Zen Garden", "Romantic Serenade", "Sunny Day", "Late Night Jazz",
    "Dreamy Atmosphere", "Pump-up Party", "Rainy Day", "Feel-good Funk", "Island Escapade",
    "Classic Rock", "Jazzy Brunch", "Mindful Meditation", "Epic Soundtrack", "Stress Relief",
    "Salsa Fiesta", "Soulful R&B", "Electronic Dance", "Country Roads", "Urban Vibes",
    "Smooth Jazz", "Indie Discovery", "Reggae Vibes", "Guitar Strumming", "Vintage Classics",
    "Latin Fever", "Deep House", "Piano Reflections", "Throwback Hits", "Alternative Edge",
    "Calm Waters", "Disco Fever", "Motown Magic", "Folk Fusion", "Hip-hop Groove",
    "Classical Symphony", "Techno Beats", "Opera Night", "Ambient Bliss"
]

playlist_mapping = {i: name for i, name in enumerate(playlist_names)}

# Mapping cluster labels to playlist names
spotify5k_df['playlist_name'] = spotify5k_df['playlist_cluster'].map(playlist_mapping)

# Define score ranges for each playlist type
mood_range = range(0, 100)
emotion_range = range(100, 200)
activity_range = range(200, max(playlist_summary['score'])+1)

# Assign playlist_type based on score
playlist_summary['playlist_type'] = ''
for idx, row in playlist_summary.iterrows():
    if row['score'] in mood_range:
        playlist_summary.at[idx, 'playlist_type'] = 'Mood'
    elif row['score'] in emotion_range:
        playlist_summary.at[idx, 'playlist_type'] = 'Emotion'
    else:
        playlist_summary.at[idx, 'playlist_type'] = 'Activity'
playlist_summary.sort_values(by=['score', 'playlist_type', 'name'], ascending=[False, True, True])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[32], line 43
     41 mood_range = range(0, 100)
     42 emotion_range = range(100, 200)
---> 43 activity_range = range(200, max(playlist_summary['score'])+1)
     45 # Assign playlist_type based on score
     46 playlist_summary['playlist_type'] = ''

NameError: name 'playlist_summary' is not defined